Prediction model


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve


/Users/Home/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

Import data

Might still need to clean up the files some after import


In [2]:
path = '../Clean Data'
X_fn = 'X.csv'
y_fn = 'y.csv'
X_path = os.path.join(path, X_fn)
y_path = os.path.join(path, y_fn)

X = pd.read_csv(X_path)
y = pd.read_csv(y_path)

In [3]:
X.head()


Out[3]:
Unnamed: 0 cluster_id_6 Year nameplate_capacity DATETIME GROSS LOAD (MW) ERCOT Load, MW Total Wind Installed, MW Total Wind Output, MW Wind Output, % of Installed Wind Output, % of Load 1-hr MW change 1-hr % change Net Load (MW) Net Load Change (MW) Month NG Price ($/mcf) All coal Lignite Subbituminous
0 0 0 2007 5949.0 2007-01-01 00:00:00 4596.0 30428.0 2790.0 1074.0 38.494624 3.529644 NaN NaN 29354.0 NaN 1 6.42 25.1475 20.0275 28.115
1 1 0 2007 5949.0 2007-01-01 01:00:00 4566.0 30133.0 2790.0 922.6 33.068100 3.061760 -151.4 -14.096834 29210.4 -143.6 1 6.42 25.1475 20.0275 28.115
2 2 0 2007 5949.0 2007-01-01 02:00:00 4667.0 29941.0 2790.0 849.2 30.437276 2.836245 -73.4 -7.955777 29091.8 -118.6 1 6.42 25.1475 20.0275 28.115
3 3 0 2007 5949.0 2007-01-01 03:00:00 4668.0 29949.0 2790.0 1056.3 37.860215 3.526996 207.1 24.387659 28892.7 -199.1 1 6.42 25.1475 20.0275 28.115
4 4 0 2007 5949.0 2007-01-01 04:00:00 4685.0 30248.0 2790.0 837.1 30.003584 2.767456 -219.2 -20.751680 29410.9 518.2 1 6.42 25.1475 20.0275 28.115

Rename the cluster column to just cluster. This won't be needed once we export from the group classification with the correct column name


In [3]:
X.rename(columns={'cluster_id_6':'cluster'}, inplace=True)

Ratio of prices

Divide coal prices by natural gas price, drop natural gas price


In [4]:
for fuel in ['All coal', 'Lignite', 'Subbituminous']:
    X.loc[:,fuel] = X.loc[:,fuel].values/X.loc[:,'NG Price ($/mcf)'].values
    
X.drop('NG Price ($/mcf)', axis=1, inplace=True)

One-hot encoding of the cluster variable

I'm trying to make this easy for using with different numbers of clusters


In [5]:
cluster_ids = X['cluster'].unique()
for cluster in cluster_ids:
    X['cluster_{}'.format(cluster)] = np.eye(len(cluster_ids))[X['cluster'],cluster]

In [6]:
X.head()


Out[6]:
Unnamed: 0 cluster Year nameplate_capacity DATETIME GROSS LOAD (MW) ERCOT Load, MW Total Wind Installed, MW Total Wind Output, MW Wind Output, % of Installed ... Month All coal Lignite Subbituminous cluster_0 cluster_1 cluster_2 cluster_3 cluster_4 cluster_5
0 0 0 2007 5949.0 2007-01-01 00:00:00 4596.0 30428.0 2790.0 1074.0 38.494624 ... 1 3.917056 3.119548 4.379283 1.0 0.0 0.0 0.0 0.0 0.0
1 1 0 2007 5949.0 2007-01-01 01:00:00 4566.0 30133.0 2790.0 922.6 33.068100 ... 1 3.917056 3.119548 4.379283 1.0 0.0 0.0 0.0 0.0 0.0
2 2 0 2007 5949.0 2007-01-01 02:00:00 4667.0 29941.0 2790.0 849.2 30.437276 ... 1 3.917056 3.119548 4.379283 1.0 0.0 0.0 0.0 0.0 0.0
3 3 0 2007 5949.0 2007-01-01 03:00:00 4668.0 29949.0 2790.0 1056.3 37.860215 ... 1 3.917056 3.119548 4.379283 1.0 0.0 0.0 0.0 0.0 0.0
4 4 0 2007 5949.0 2007-01-01 04:00:00 4685.0 30248.0 2790.0 837.1 30.003584 ... 1 3.917056 3.119548 4.379283 1.0 0.0 0.0 0.0 0.0 0.0

5 rows × 25 columns


In [8]:
X.tail()


Out[8]:
Unnamed: 0 cluster Year nameplate_capacity DATETIME GROSS LOAD (MW) ERCOT Load, MW Total Wind Installed, MW Total Wind Output, MW Wind Output, % of Installed ... Month All coal Lignite Subbituminous cluster_0 cluster_1 cluster_2 cluster_3 cluster_4 cluster_5
473329 473329 5 2015 11476.0 2015-12-31 19:00:00 7516.0 39908.77734 16170.0 3824.932373 23.654498 ... 12 12.793722 10.780269 14.596413 0.0 0.0 0.0 0.0 0.0 1.0
473330 473330 5 2015 11476.0 2015-12-31 20:00:00 6552.0 38736.85938 16170.0 4625.632813 28.606264 ... 12 12.793722 10.780269 14.596413 0.0 0.0 0.0 0.0 0.0 1.0
473331 473331 5 2015 11476.0 2015-12-31 21:00:00 5944.0 37587.70313 16170.0 4957.714844 30.659956 ... 12 12.793722 10.780269 14.596413 0.0 0.0 0.0 0.0 0.0 1.0
473332 473332 5 2015 11476.0 2015-12-31 22:00:00 5698.0 36356.26172 16170.0 4699.097656 29.060592 ... 12 12.793722 10.780269 14.596413 0.0 0.0 0.0 0.0 0.0 1.0
473333 473333 5 2015 11476.0 2015-12-31 23:00:00 5365.0 35150.33984 16170.0 4313.125000 26.673624 ... 12 12.793722 10.780269 14.596413 0.0 0.0 0.0 0.0 0.0 1.0

5 rows × 25 columns


In [9]:
y.tail()


Out[9]:
Unnamed: 0 DATETIME cluster_id_6 Gen Change (MW)
473329 473329 2015-12-31 19:00:00 5 -20.0
473330 473330 2015-12-31 20:00:00 5 -964.0
473331 473331 2015-12-31 21:00:00 5 -608.0
473332 473332 2015-12-31 22:00:00 5 -246.0
473333 473333 2015-12-31 23:00:00 5 -333.0

Drop unnecessary columns and replace nan's with 0


In [7]:
X_cols = ['nameplate_capacity', 'GROSS LOAD (MW)', 'ERCOT Load, MW',
          'Total Wind Installed, MW', 'Total Wind Output, MW', 'Net Load Change (MW)',
          'All coal', 'Lignite', 'Subbituminous']
X_cluster_cols = ['cluster_{}'.format(cluster) for cluster in cluster_ids]

X_clean = X.loc[:,X_cols+X_cluster_cols]
X_clean.fillna(0, inplace=True)

y_clean = y.loc[:,'Gen Change (MW)']
y_clean.fillna(0, inplace=True)

In [11]:
print X_clean.shape
print y_clean.shape


(473334, 15)
(473334,)

Split into training, validation, testing


In [8]:
X_train = X_clean.loc[(X['Year']<2012),:]
y_train = y_clean.loc[(X['Year']<2012)]

X_va = X_clean.loc[X['Year'].isin([2012, 2013]),:]
y_va = y_clean.loc[X['Year'].isin([2012, 2013])]

X_test = X_clean.loc[X['Year']>2013,:]
y_test = y_clean.loc[X['Year']>2013]

Somehow we're missing 2 records from X_va


In [13]:
print X_va.shape, y_va.shape


(105264, 15) (105264,)

Need scaled versions of the X data for some of the models


In [9]:
X_train_scaled = StandardScaler().fit_transform(X_train)
X_va_scaled = StandardScaler().fit_transform(X_va)
X_test_scaled = StandardScaler().fit_transform(X_test)

Check size of all arrays


In [15]:
print X_train_scaled.shape, y_train.shape
print X_va_scaled.shape, y_va.shape
print X_test_scaled.shape, y_test.shape


(262944L, 16L) (262944L,)
(105264L, 16L) (105264L,)
(105126L, 16L) (105126L,)

Linear Regression (OLS)


In [10]:
lm = LinearRegression()
lm.fit(X_train_scaled, y_train)


Out[10]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
lm.score(X_va_scaled, y_va)


Out[11]:
0.26678565336933879

In [12]:
y_pr = lm.predict(X_va_scaled)

In [13]:
y_va.values.shape, y_pr.shape, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values.shape


Out[13]:
((105264,), (105264,), (105264,))

In [14]:
y_lm_resids = pd.DataFrame(dict(zip(['Gen Change (MW)', 'y_pr', 'cluster'],
                               [y_va.values, y_pr, X.loc[X['Year'].isin([2012, 2013]),'cluster'].values])))
# y_lm_resids['y_pr'] = y_pr
# y_lm_resids['cluster'] = X.loc[:,'cluster']

In [21]:
y_lm_resids.head()


Out[21]:
Gen Change (MW) cluster y_pr
0 0.0 0 -56.223127
1 1.0 0 -15.262189
2 -1.0 0 -12.223127
3 0.0 0 -9.316877
4 0.0 0 25.909686

In [15]:
y_lm_resids.loc[:,'residuals'] = y_lm_resids.loc[:,'y_pr'] - y_lm_resids.loc[:,'Gen Change (MW)']

In [36]:
with sns.axes_style('whitegrid'):
    g = sns.FacetGrid(y_lm_resids, hue='cluster', col='cluster',
                      col_wrap=3)
    g.map(plt.scatter, 'y_pr', 'residuals', s=5, alpha=0.3)
    g.set_xlabels(size=15)
    g.set_ylabels(size=15)
    plt.savefig('OLS residuals.pdf')


Out[36]:
<seaborn.axisgrid.FacetGrid at 0x12463e290>
Out[36]:
<seaborn.axisgrid.FacetGrid at 0x12463e290>
Out[36]:
<seaborn.axisgrid.FacetGrid at 0x12463e290>

LinearSVR

The rbf kernal of SVR is very slow with this large of a dataset. If we want to try using it, should probably subsample.

Shows use with GridSearchCV if you guys want to use it


In [20]:
svm = LinearSVR()

In [25]:
parameters = {'C':np.logspace(-5, 3, num=15)}

In [26]:
lm = GridSearchCV(svm, parameters, n_jobs=-1, verbose=3)

Run the LinearSVR with gridsearch over the 15 parameter values. GridSearchCV does 3-fold CV by default.


In [27]:
results = lm.fit(X_train_scaled, y_train)


Fitting 3 folds for each of 15 candidates, totalling 45 fits
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.0min finished

Validation Curve


In [25]:
param_range = np.logspace(-5, 3, num=13)
train_scores, valid_scores = validation_curve(LinearSVR(), X_train_scaled, y_train, 
                                              "C", param_range, n_jobs=-1)

In [26]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)


plt.title("Validation Curve - LinearSVR", size=15)
plt.xlabel("C", size=15)
plt.ylabel("Score", size=15)
plt.ylim(0.0, .2)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, valid_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.savefig('LinearSVR validation curve.pdf', bbox_inches='tight')


Out[26]:
<matplotlib.text.Text at 0x11af1c910>
Out[26]:
<matplotlib.text.Text at 0x1244beb10>
Out[26]:
<matplotlib.text.Text at 0x1194b3690>
Out[26]:
(0.0, 0.2)
Out[26]:
[<matplotlib.lines.Line2D at 0x11b9d3350>]
Out[26]:
<matplotlib.collections.PolyCollection at 0x11b9d3310>
Out[26]:
[<matplotlib.lines.Line2D at 0x1194a3cd0>]
Out[26]:
<matplotlib.collections.PolyCollection at 0x117583a90>
Out[26]:
<matplotlib.legend.Legend at 0x11758e250>

Learning Curve


In [28]:
# xLen = len(X_train_scaled)
# tSize = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
train_sizes, train_scores, valid_scores = learning_curve(LinearSVR(), X_train_scaled, 
                                                         y_train, n_jobs=-1)

In [33]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

plt.title("Learning Curve - LinearSVR", size=15)
plt.xlabel("Training Size", size=15)
plt.ylabel("Score", size=15)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="darkorange")
plt.fill_between(train_sizes, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.1, color="navy")
plt.plot(train_sizes, train_scores_mean, 'o-', color="darkorange",
         label="Training score")
plt.plot(train_sizes, valid_scores_mean, 'o-', color="navy",
         label="Cross-validation score")

plt.legend(loc="best")
plt.savefig('LinearSVR learning curve.pdf', bbox_inches='tight')


Out[33]:
<matplotlib.text.Text at 0x124a4be50>
Out[33]:
<matplotlib.text.Text at 0x11b71e550>
Out[33]:
<matplotlib.text.Text at 0x1241ca5d0>
Out[33]:
<matplotlib.collections.PolyCollection at 0x11d682090>
Out[33]:
<matplotlib.collections.PolyCollection at 0x11d682190>
Out[33]:
[<matplotlib.lines.Line2D at 0x124b495d0>]
Out[33]:
[<matplotlib.lines.Line2D at 0x124b49b90>]
Out[33]:
<matplotlib.legend.Legend at 0x124b49d10>

cv_results_ returns a dictionary with all of the results and parameters


In [28]:
results.cv_results_


Out[28]:
{'mean_fit_time': array([  0.29466669,   0.31133342,   0.53800003,   0.65266665,
          0.80899994,   0.78433331,   0.81199996,   0.87266668,
          0.9866666 ,   1.28399992,   2.15866669,   4.79399999,
         11.95500008,  25.98933331,  43.23566675]),
 'mean_score_time': array([ 0.03633332,  0.03933334,  0.08633327,  0.01066677,  0.0126667 ,
         0.00999999,  0.01366663,  0.01433341,  0.01833336,  0.01333332,
         0.01299993,  0.01333332,  0.01366663,  0.01099992,  0.00766659]),
 'mean_test_score': array([ 0.00287043,  0.00987067,  0.03003211,  0.07128326,  0.11854968,
         0.1501322 ,  0.1644035 ,  0.16960687,  0.17144693,  0.17192847,
         0.17160952,  0.17128199,  0.17042826,  0.17088725,  0.16581553]),
 'mean_train_score': array([ 0.00287357,  0.00988178,  0.03007964,  0.07148156,  0.11913756,
         0.1513134 ,  0.16617615,  0.17197534,  0.17427679,  0.17499071,
         0.17536349,  0.17557742,  0.17513131,  0.17544765,  0.17128382]),
 'param_C': masked_array(data = [1.0000000000000001e-05 3.7275937203149381e-05 0.00013894954943731373
  0.0005179474679231213 0.0019306977288832496 0.0071968567300115137
  0.026826957952797246 0.10000000000000001 0.37275937203149379
  1.3894954943731359 5.1794746792312019 19.306977288832496 71.96856730011514
  268.26957952797216 1000.0],
              mask = [False False False False False False False False False False False False
  False False False],
        fill_value = ?),
 'params': ({'C': 1.0000000000000001e-05},
  {'C': 3.7275937203149381e-05},
  {'C': 0.00013894954943731373},
  {'C': 0.0005179474679231213},
  {'C': 0.0019306977288832496},
  {'C': 0.0071968567300115137},
  {'C': 0.026826957952797246},
  {'C': 0.10000000000000001},
  {'C': 0.37275937203149379},
  {'C': 1.3894954943731359},
  {'C': 5.1794746792312019},
  {'C': 19.306977288832496},
  {'C': 71.96856730011514},
  {'C': 268.26957952797216},
  {'C': 1000.0}),
 'rank_test_score': array([15, 14, 13, 12, 11, 10,  9,  7,  3,  1,  2,  4,  6,  5,  8]),
 'split0_test_score': array([ 0.00301764,  0.01030828,  0.03116889,  0.07332238,  0.12036935,
         0.15154889,  0.16511589,  0.1696596 ,  0.17094722,  0.17212156,
         0.17179597,  0.17210909,  0.17093621,  0.1711447 ,  0.17189868]),
 'split0_train_score': array([ 0.00295224,  0.01009817,  0.03066308,  0.07290477,  0.12169576,
         0.15579254,  0.17192734,  0.17853343,  0.1809065 ,  0.18234366,
         0.18280627,  0.18269438,  0.18187619,  0.18167064,  0.1834368 ]),
 'split1_test_score': array([ 0.00296258,  0.01014405,  0.03052811,  0.07106492,  0.11597412,
         0.14511882,  0.15822912,  0.1627618 ,  0.16480322,  0.16443405,
         0.16456281,  0.16445884,  0.16459095,  0.16665469,  0.15697647]),
 'split1_train_score': array([ 0.00304868,  0.01044205,  0.0314778 ,  0.0736199 ,  0.12111131,
         0.15264044,  0.16698589,  0.17228259,  0.17506954,  0.17496312,
         0.1752573 ,  0.17530995,  0.17516148,  0.17714316,  0.16690628]),
 'split2_test_score': array([ 0.00263108,  0.00915966,  0.02839933,  0.06946248,  0.11930557,
         0.15372889,  0.16986548,  0.1763992 ,  0.17859036,  0.17922978,
         0.17846979,  0.17727803,  0.17575762,  0.17486237,  0.16857143]),
 'split2_train_score': array([ 0.00261979,  0.00910511,  0.02809803,  0.06792001,  0.11460562,
         0.14550722,  0.15961522,  0.16510999,  0.16685434,  0.16766536,
         0.1680269 ,  0.16872794,  0.16835626,  0.16752916,  0.1635084 ]),
 'std_fit_time': array([  1.24723493e-03,   1.64992388e-02,   1.03405361e-01,
          1.92238952e-02,   2.90631870e-02,   1.26376020e-01,
          5.12314671e-02,   7.84064085e-02,   1.09332276e-01,
          1.44589107e-01,   2.24553307e-01,   4.04309506e-02,
          2.89872308e-01,   5.01694043e-01,   1.52403465e+00]),
 'std_score_time': array([ 0.00402773,  0.0037713 ,  0.08738544,  0.00235708,  0.00169974,
         0.        ,  0.00309114,  0.00339932,  0.00825964,  0.00094274,
         0.00141417,  0.00169974,  0.00169955,  0.00163307,  0.00047137]),
 'std_test_score': array([ 0.00017074,  0.00050721,  0.00118382,  0.00158335,  0.00187226,
         0.00365501,  0.00477716,  0.00556757,  0.00563966,  0.00604188,
         0.00567903,  0.00526599,  0.0045729 ,  0.00335571,  0.00639606]),
 'std_train_score': array([ 0.00018371,  0.00056685,  0.00144014,  0.00253526,  0.00321344,
         0.00430254,  0.00505891,  0.0054844 ,  0.00576409,  0.00599242,
         0.00603412,  0.00570491,  0.00551953,  0.00589641,  0.00870469])}

In [29]:
test_score = results.cv_results_['mean_test_score']
train_score = results.cv_results_['mean_train_score']

In [30]:
C = [results.cv_results_['params'][x]['C'] for x in range(15)]

In [31]:
C


Out[31]:
[1.0000000000000001e-05,
 3.7275937203149381e-05,
 0.00013894954943731373,
 0.0005179474679231213,
 0.0019306977288832496,
 0.0071968567300115137,
 0.026826957952797246,
 0.10000000000000001,
 0.37275937203149379,
 1.3894954943731359,
 5.1794746792312019,
 19.306977288832496,
 71.96856730011514,
 268.26957952797216,
 1000.0]

Plot the score for each value of C


In [32]:
plt.semilogx(C, test_score, C, train_score)
plt.legend(['test_score', 'train_score'])


Out[32]:
<matplotlib.legend.Legend at 0x121b05f8>

SVR with rbf

Try subsampling?


In [33]:
X_train_scaled.shape


Out[33]:
(262944L, 16L)

In [38]:
idx = np.random.choice(np.arange(len(X_train_scaled)), size=int(len(X_train_scaled)*0.5), replace=False)

In [39]:
lm = SVR()

In [40]:
lm.fit(X_train_scaled[idx], y_train[idx])


Out[40]:
SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [41]:
lm.score(X_va_scaled, y_va)


Out[41]:
0.28122217024273011

Learning Curve


In [69]:
xLen = len(X_train_scaled)
tSize = [.1, .2, .3, .4, .5]
train_sizes, train_scores, valid_scores = learning_curve(SVR(), X_train_scaled, y_train, train_sizes = tSize, n_jobs = -1, verbose = 3)

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
valid_scores_mean = np.mean(valid_scores, axis=1)
valid_scores_std = np.std(valid_scores, axis=1)

plt.grid()

plt.title("Validation Curve - SVR")
plt.fill_between(tSize, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="darkorange")
plt.fill_between(tSize, valid_scores_mean - valid_scores_std,
                 valid_scores_mean + valid_scores_std, alpha=0.1, color="navy")
plt.plot(tSize, train_scores_mean, 'o-', color="darkorange",
         label="Training score")
plt.plot(tSize, valid_scores_mean, 'o-', color="navy",
         label="Cross-validation score")

plt.legend(loc="best")
plt.show


[learning_curve] Training set sizes: [17529 35059 52588 70118 87648]
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed:  7.0min remaining: 19.2min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 20.1min remaining: 10.0min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 42.0min finished
Out[69]:
<function matplotlib.pyplot.show>